
rm(list = ls())

library(tidyverse)
library(readxl)
library(writexl)
library(lubridate)


###MOSCOW RESEARCH SITE

fxn_norm = function(x) {(x - min(x, na.rm = T))/(max(x, na.rm = T) - min(x, na.rm = T))}


moscow_raw <- read_excel("MoscowDataRaw.xlsx")


moscow_dta_temp <- moscow_raw %>%
                    select(start_time = V8, end_time = V9, randomid = RandomID,
                           matches("S1|S2", ignore.case = FALSE),
                           matches("G1|G2|G3", ignore.case = FALSE),
                           contains("lottery"), contains("PSM"),
                           job_sect,
                           age, male = gender, class_yr, contains("dep"),
                           gpa = GPA, olympiad, contains("EGE"), region,
                           city_size, contains("religion"), family_income,
                           contains("relatives"), 
                           soc_science_knowledg, device
  )




#PRELIMINARY VARIABLE CONSTRUCTION FOR DICE TASK GAME

#changing variables names of individual virtual dice rolls
#g1.1...g1.30 are the first set of 20 dice rolls (each respondent sees only 20 of these 30 dice profiles)
#g2.1...g2.30 are the second set of 20 dice rolls (each respondent sees only 20 of these 30 dice profiles)

names(moscow_dta_temp) = gsub(pattern = "\\(", replacement = "", x = names(moscow_dta_temp)) 
names(moscow_dta_temp) = gsub(pattern = "\\)", replacement = "", x = names(moscow_dta_temp)) 
names(moscow_dta_temp) = gsub(pattern = "G2v1_T", replacement = "g1.", x = names(moscow_dta_temp)) 
names(moscow_dta_temp) = gsub(pattern = "G2v2_T", replacement = "g2.", x = names(moscow_dta_temp)) 
moscow_dta_temp <- moscow_dta_temp %>% dplyr::rename(g1.25 = G2v1F_T1, g1.26 = G2v1F_T2,  g1.27 = G2v1F_T3,  g1.28 = G2v1F_T4, g1.29 = G2v1F_T5 , g1.30 = G2v1F_T6)
moscow_dta_temp <- moscow_dta_temp %>% dplyr::rename(g2.25 = G2v2F_T1, g2.26 = G2v2F_T2,  g2.27 = G2v2F_T3,  g2.28 = G2v2F_T4, g2.29 = G2v2F_T5 , g2.30 = G2v2F_T6)

# correct guesses are already coded as 1, incorrect guesses as NA; changing here NA to 0
#moscow_dta_temp <- moscow_dta_temp %>% mutate_at(vars(starts_with('g', ignore.case = F), -contains("gpa")), funs(replace_na(., 0)))
#replaced the above line with the line below b/c "funs" deprecated
moscow_dta_temp <- moscow_dta_temp %>% mutate_at(vars(starts_with('g', ignore.case = F), -contains("gpa")), list(~replace_na(., 0)))

# correct_guesses1 is sum of correct guesses in first 20 dice rolls
dice_rolls1 <- moscow_dta_temp %>% select(starts_with('g1.', ignore.case = F))
moscow_dta_temp$correct_guesses1 <- rowSums(dice_rolls1)   

# correct_guesses2 is sum of correct guesses in second 20 dice rolls
dice_rolls2 <- moscow_dta_temp %>% select(starts_with('g2.', ignore.case = F))
moscow_dta_temp$correct_guesses2 <- rowSums(dice_rolls2)  


#PRELIMINARY VARIABLE CONSTRUCTION FOR BRIBERY GAME

moscow_dta_temp$bribe_accept_size1 = with(moscow_dta_temp,
                                          ifelse(G3_bur_br_350 > G3_bur_br_300, 7,
                                                 ifelse(G3_bur_br_300 > G3_bur_br_250, 6,
                                                        ifelse(G3_bur_br_250 > G3_bur_br_200, 5,
                                                               ifelse(G3_bur_br_200 > G3_bur_br_150, 4,
                                                                      ifelse(G3_bur_br_150 > G3_bur_br_100, 3,
                                                                             ifelse(G3_bur_br_100 > G3_bur_br_50, 2,
                                                                                    ifelse(G3_bur_br_50 > 0, 1, 0)))))))
)

#this version removes subjects with non-transitive preferences
moscow_dta_temp$bribe_accept_size2 = with(moscow_dta_temp,
                                          ifelse((G3_bur_br_50 <= G3_bur_br_100 & 
                                                    G3_bur_br_100 <= G3_bur_br_150 & 
                                                    G3_bur_br_150 <= G3_bur_br_200 & 
                                                    G3_bur_br_200 <= G3_bur_br_250 &
                                                    G3_bur_br_250 <= G3_bur_br_300 & 
                                                    G3_bur_br_300 <= G3_bur_br_350) == TRUE, bribe_accept_size1, NA)
)

#PRELIMINARY VARIABLE CONSTRUCTION FOR ACADEMIC DEPARTMENT

moscow_dta_temp <- moscow_dta_temp %>% mutate(dep_text = case_when(
  grepl("бизнес|БИ|Логист|безопас|информ|сратег|менедж|бм|финансового|управлен", dep_TEXT, ignore.case = T) ~ "Bus",
  grepl("между|миров|МЭ|МИЭФ", dep_TEXT, ignore.case = T) ~ "Intl",
  grepl("экон|РЭШ|стат|финансов", dep_TEXT, ignore.case = T) ~ "Econ",
  grepl("интел|комп|Програм|ФКН", dep_TEXT, ignore.case = T) ~ "CS",
  grepl("ВШУ|урбан", dep_TEXT, ignore.case = T) ~ "Urban",
  grepl("гуман|ФГН|истор|Линг|филос", dep_TEXT, ignore.case = T) ~ "Humanities",
  grepl("матем|Мвтем|МИЭМ|ДПМ", dep_TEXT, ignore.case = T) ~ "Math",
  grepl("комм|медиа|дизайн|журнал|реклам", dep_TEXT, ignore.case = T) ~ "Comm",
  grepl("прав|юрис", dep_TEXT, ignore.case = T) ~ "Law",
  grepl("Искусства|УиЭЗ|нет деп|-", dep_TEXT, ignore.case = T) ~ "Other")
)

#OTHER PRELIMINARY CLEANING

moscow_dta_temp <- moscow_dta_temp %>% mutate_at(vars(contains("_work_")), list(~replace_na(., 0)))



#CREATING DATA SET

moscow_dta <- moscow_dta_temp %>% transmute(
                    #DICTATOR GAME
                    donate = G1_donate_1, 
                    #converts to ruble amount
                    donate50 = donate*50,
                    donate_norm = fxn_norm(donate),
                    #DICE TASK GAME
                    correct_guesses1, correct_guesses2,
                    correct_guesses = correct_guesses1 + correct_guesses2,
                    cheat_rate = (1/(1 - (1/6))) * 1/40 * correct_guesses - (1/6)/(1 - (1/6)),
                    #BRIBERY GAME
                    bribe_accept_size1,
                    bribe_accept_size2,
                    bribe_offer_size = G3_citizen_1,
                    bribe_offer = ifelse(bribe_offer_size > 0, 1, 0),
                    bribe_accept = ifelse(bribe_accept_size1 > 0, 1, 0),
                    bribe = ifelse(is.na(bribe_offer) == FALSE, bribe_offer, bribe_accept),
                    citizen_nobribe = G3_citizen_nobribe,
                    bur_nobribe = G3_bur_nobribe,
                    #RISK AVERSION MEASURE
                    risk_averse1 = 8 - (lottery_1 + lottery_2 + lottery_3 + lottery_4 + lottery_5 + lottery_6 + lottery_7),
                    risk_averse_norm = fxn_norm(risk_averse1),
                    #this version removes subjects with non-transitive preferences
                    risk_averse2 = ifelse((lottery_1 <= lottery_2 & 
                                             lottery_2 <= lottery_3 & 
                                             lottery_3 <= lottery_4 &
                                             lottery_4 <= lottery_5 &
                                             lottery_5 <= lottery_6 & 
                                             lottery_6 <= lottery_7) == TRUE, risk_averse1, NA),
                    #PSM
                    aps1 = PSM_1, aps2 = PSM_2, aps3 = PSM_3, aps4 = PSM_4,
                    cpv1 = PSM_5, cpv2 = PSM_6, cpv3 = PSM_7, cpv4 = PSM_8,
                    com1 = PSM_9, com2 = PSM_10, com3 = PSM_11, com4 = PSM_12,
                    ss1 = PSM_13, ss2 = PSM_14, ss3 = PSM_15, ss4 = PSM_16,
                    aps = (aps1 + aps2 + aps3 + aps4)/4,
                    cpv = (cpv1 + cpv2 + cpv3 + cpv4)/4,
                    com = (com1 + com2 + com3 + com4)/4,
                    ss = (ss1 + ss2 + ss3 + ss4)/4,
                    psm = (aps + cpv + com + ss)/4,
                    psm_norm = fxn_norm(psm),
                    aps_norm = fxn_norm(aps), cpv_norm = fxn_norm(cpv),
                    com_norm = fxn_norm(com), ss_norm = fxn_norm(ss),
                    #DEMOGRAPHIC/ATTITUDINAL VARIABLES
                    job_sect, age, male, olympiad, gpa, family_income,
                    city_size, religion, religion_services,
                    religion_denom = recode(religion_denom, `1` = "Orthodox Christianity", `2` =  "Protestantism",
                                            `3` = "Catholicism", `4` = "Islam", `5` = "Judaism", `6` = "Buddhism",
                                            `7` = "Other"),
                    gpa_norm = fxn_norm(gpa),
                    family_inc_norm = fxn_norm(family_income),
                    city_size_norm = fxn_norm(city_size),
                    ege_taken = EGE_taken,
                    ege_math = as.numeric(EGE_3_TEXT),
                    ege_ru = na_if(EGE_10_TEXT, "-"),
                    ege_ru = as.numeric(ege_ru),
                    ege_avg = (ege_math + ege_ru)/2,
                    region1 = recode(region, `1` = "Moscow", `44` = "Moscow", `99` = "Foreign", .default = "Regions"),
                    region2 = plyr::mapvalues(region, 
                                              from = c(3, 8, 14, 25, 34, 57, #southern
                                                       1, 10, 11, 13, 16, 20, 26, 38, 40, 44, 52, 58, 66, 68, 70, 72, 85,#central
                                                       9, 31, 43, 47, 51, 53, 54, 59, 60, 75, 76, 82, 69,#volga
                                                       65, 67, #n caucasus
                                                       36, #crimea
                                                       2, 7, 15, 24, 29, 32, 48, #northwest
                                                       63, 74, 79, 80, 84, #ural
                                                       5, 12, 22, 30, 35, 49, 50, 73, 78, #siberia
                                                       6, 27, 41, 55, 62, 77, 61, #far east
                                                       99),#foreign 
                                              to = c(rep("Southern", 6), rep("Central", 17), rep("Volga", 13), rep("N. Caucasus", 2),
                                                     rep("Crimea", 1), rep("Northwest", 7), rep("Ural", 5), rep("Siberia", 9), 
                                                     rep("Far East", 7), rep("Foreign", 1))),
                    f_work_fed = relatives_work_1_1, m_work_fed = relatives_work_1_2, r_work_fed = relatives_work_1_3,
                    f_work_reg = relatives_work_2_1, m_work_reg = relatives_work_2_2, r_work_reg = relatives_work_2_3,
                    f_work_gov = relatives_work_3_1, m_work_gov = relatives_work_3_2, r_work_gov = relatives_work_3_3,
                    f_work_corp = relatives_work_4_1, m_work_corp = relatives_work_4_2, r_work_corp = relatives_work_4_3,
                    f_work_sme = relatives_work_5_1, m_work_sme = relatives_work_5_2, r_work_sme = relatives_work_5_3,
                    f_work_owner = relatives_work_6_1, m_work_owner = relatives_work_6_2, r_work_owner = relatives_work_6_3,
                    f_work_finance = relatives_work_7_1, m_work_finance = relatives_work_7_2, r_work_finance = relatives_work_7_3,
                    f_work_ngo = relatives_work_8_1, m_work_ngo = relatives_work_8_2, r_work_ngo = relatives_work_8_3,
                    f_work_law = relatives_work_9_1, m_work_law = relatives_work_9_2, r_work_law = relatives_work_9_3,
                    f_work_mil = relatives_work_10_1, m_work_mil = relatives_work_10_2, r_work_mil = relatives_work_10_3,
                    f_work_other = relatives_work_11_1, m_work_other = relatives_work_11_2, r_work_other = relatives_work_11_3,
                    f_no_work = relatives_work_12_1, m_no_work = relatives_work_12_2, r_no_work = relatives_work_12_3,
                    f_work_na = relatives_work_13_1, m_work_na = relatives_work_13_2, r_work_na = relatives_work_13_3,
                    f_pubsect = ifelse(f_work_fed == 1 | f_work_reg == 1 | f_work_gov == 1, 1, 0),
                    m_pubsect = ifelse(m_work_fed == 1 | m_work_reg == 1 | m_work_gov == 1, 1, 0),
                    r_pubsect = ifelse(r_work_fed == 1 | r_work_reg == 1 | r_work_gov == 1, 1, 0),
                    rel_pubsect = ifelse(f_pubsect == 1 | m_pubsect == 1 | r_pubsect == 1, 1, 0),
                    parent_pubsect = ifelse(f_work_fed == 1 | f_work_reg == 1 | f_work_gov == 1 | m_work_fed == 1 | m_work_reg == 1 | m_work_gov == 1, 1, 0), 
                    parent_privsect = ifelse(f_work_corp == 1 | f_work_sme == 1 | f_work_owner == 1 | f_work_finance == 1 | m_work_corp == 1 | m_work_sme == 1 | m_work_owner == 1 | m_work_finance == 1, 1, 0), 
                    parent_ngo = ifelse(f_work_ngo == 1 | m_work_ngo == 1, 1, 0),
                    parent_mil = ifelse(f_work_mil == 1 | m_work_mil == 1, 1, 0),
                    parent_legal = ifelse(f_work_law == 1 | m_work_law == 1, 1, 0),
                    #collapsing all MA students together
                    class_yr = recode(class_yr, `6` = 5),
                    #integrating write-in info on departments
                    dep = ifelse(dep == 5 & is.na(dep_text) == FALSE, dep_text, dep),
                    dep = recode_factor(dep, `1` = "PA", `2` = "PS", `3` = "Soc", `4` = "Psych", `5` = "Other"),
                    dep2 = recode_factor(dep, "Law" = "Other", "Humanities" = "Other", "CS" = "Other", "Math" = "Other", "Intl" = "Other", "Urban" = "Other", "Psych" = "Other"),
                    dep_PA = ifelse(is.na(dep) == F & dep == "PA", 1, 0),
                    #QUALITY CHECK VARIABLES
                    randomid, soc_sci_knowledge = soc_science_knowledg, device,
                    #time to complete
                    duration = round(difftime(end_time, start_time, units=c("auto","secs","mins","hours","days","weeks")), 0),
                    #screeners
                    screener1 = ifelse(is.na(S1_27) == F & is.na(S1_29) == F, 1, 0),
                    screener2 = ifelse(is.na(S2_5) == F & is.na(S2_10) == F, 1, 0),
                    screener = screener1 + screener2
)

moscow_dta$study <- rep("Moscow", nrow(moscow_dta))

MoscowClean_DF <- as.data.frame(moscow_dta)

#write.csv(MoscowClean_DF, "MoscowClean.csv", na = "", row.names = F)






###REGIONAL RUSSIAN RESEARCH SITE
rm(list = ls())

fxn_norm = function(x) {(x - min(x, na.rm = T))/(max(x, na.rm = T) - min(x, na.rm = T))}

region_raw <- read_excel("RegionDataRaw.xlsx")


region_dta_temp <- region_raw %>%
            select(start_time = V8, end_time = V9, randomid = RandomID,
                   matches("S1|S2", ignore.case = FALSE),
                   matches("G1|G2|G3", ignore.case = FALSE),
                   contains("lottery"), contains("PSM"), 
                   job_sect,
                   age, male = gender, class_yr,
                   dep = fakultet,
                   gpa = GPA, olympiad, contains("EGE"), region,
                   city_size, contains("religion"), family_income,
                   contains("relatives"), 
                   soc_science_knowledg, device, 
                   Q119, Q120, Q121, Q122_6
  )



#PRELIMINARY VARIABLE CONSTRUCTION FOR DICE TASK GAME

#changing variables names of individual virtual dice rolls
#g1.1...g1.30 are the first set of 20 dice rolls (each respondent sees only 20 of these 30 dice profiles)
#g2.1...g2.30 are the second set of 20 dice rolls (each respondent sees only 20 of these 30 dice profiles)

names(region_dta_temp) = gsub(pattern = "\\(", replacement = "", x = names(region_dta_temp)) 
names(region_dta_temp) = gsub(pattern = "\\)", replacement = "", x = names(region_dta_temp)) 
names(region_dta_temp) = gsub(pattern = "G2v1_T", replacement = "g1.", x = names(region_dta_temp)) 
names(region_dta_temp) = gsub(pattern = "G2v2_T", replacement = "g2.", x = names(region_dta_temp)) 
region_dta_temp <- region_dta_temp %>% dplyr::rename(g1.25 = G2v1F_T1, g1.26 = G2v1F_T2,  g1.27 = G2v1F_T3,  g1.28 = G2v1F_T4, g1.29 = G2v1F_T5 , g1.30 = G2v1F_T6)
region_dta_temp <- region_dta_temp %>% dplyr::rename(g2.25 = G2v2F_T1, g2.26 = G2v2F_T2,  g2.27 = G2v2F_T3,  g2.28 = G2v2F_T4, g2.29 = G2v2F_T5 , g2.30 = G2v2F_T6)

# correct guesses are already coded as 1, incorrect guesses as NA; changing here NA to 0

region_dta_temp <- region_dta_temp %>% mutate_at(vars(starts_with('g', ignore.case = F), -contains("gpa")), list(~replace_na(., 0)))

# correct_guesses1 is sum of correct guesses in first 20 dice rolls
dice_rolls1 <- region_dta_temp %>% select(starts_with('g1.', ignore.case = F))
region_dta_temp$correct_guesses1 <- rowSums(dice_rolls1)   

# correct_guesses2 is sum of correct guesses in second 20 dice rolls
dice_rolls2 <- region_dta_temp %>% select(starts_with('g2.', ignore.case = F))
region_dta_temp$correct_guesses2 <- rowSums(dice_rolls2)  


#PRELIMINARY VARIABLE CONSTRUCTION FOR BRIBERY GAME

region_dta_temp$bribe_accept_size1 = with(region_dta_temp,
                                          ifelse(G3_bur_br_350 > G3_bur_br_300, 7,
                                                 ifelse(G3_bur_br_300 > G3_bur_br_250, 6,
                                                        ifelse(G3_bur_br_250 > G3_bur_br_200, 5,
                                                               ifelse(G3_bur_br_200 > G3_bur_br_150, 4,
                                                                      ifelse(G3_bur_br_150 > G3_bur_br_100, 3,
                                                                             ifelse(G3_bur_br_100 > G3_bur_br_50, 2,
                                                                                    ifelse(G3_bur_br_50 > 0, 1, 0)))))))
)

#this version removes subjects with non-transitive preferences
region_dta_temp$bribe_accept_size2 = with(region_dta_temp,
                                          ifelse((G3_bur_br_50 <= G3_bur_br_100 & 
                                                    G3_bur_br_100 <= G3_bur_br_150 & 
                                                    G3_bur_br_150 <= G3_bur_br_200 & 
                                                    G3_bur_br_200 <= G3_bur_br_250 &
                                                    G3_bur_br_250 <= G3_bur_br_300 & 
                                                    G3_bur_br_300 <= G3_bur_br_350) == TRUE, bribe_accept_size1, NA)
)


#OTHER PRELIMINARY CLEANING

region_dta_temp <- region_dta_temp %>% mutate_at(vars(contains("_work_")), list(~replace_na(., 0)))



#CREATING DATA SET

region_dta <- region_dta_temp %>% transmute(
                        #DICTATOR GAME
                        donate = G1_donate_1, 
                        #converts to ruble amount
                        donate25 = donate*25,
                        donate_norm = fxn_norm(donate),
                        #DICE TASK GAME
                        correct_guesses1, correct_guesses2,
                        correct_guesses = correct_guesses1 + correct_guesses2,
                        cheat_rate = (1/(1 - (1/6))) * 1/40 * correct_guesses - (1/6)/(1 - (1/6)),
                        #BRIBERY GAME
                        bribe_accept_size1,
                        bribe_accept_size2,
                        bribe_offer_size = G3_citizen_1,
                        bribe_offer = ifelse(bribe_offer_size > 0, 1, 0),
                        bribe_accept = ifelse(bribe_accept_size1 > 0, 1, 0),
                        bribe = ifelse(is.na(bribe_offer) == FALSE, bribe_offer, bribe_accept),
                        citizen_nobribe = G3_citizen_nobribe,
                        bur_nobribe = G3_bur_nobribe,
                        #RISK AVERSION MEASURE
                        risk_averse1 = 8 - (lottery_1 + lottery_2 + lottery_3 + lottery_4 + lottery_5 + lottery_6 + lottery_7),
                        risk_averse_norm = fxn_norm(risk_averse1),
                        #this version removes subjects with non-transitive preferences
                        risk_averse2 = ifelse((lottery_1 <= lottery_2 & 
                                                 lottery_2 <= lottery_3 & 
                                                 lottery_3 <= lottery_4 &
                                                 lottery_4 <= lottery_5 &
                                                 lottery_5 <= lottery_6 & 
                                                 lottery_6 <= lottery_7) == TRUE, risk_averse1, NA),
                        #PSM
                        aps1 = PSM_1, aps2 = PSM_2, aps3 = PSM_3, aps4 = PSM_4,
                        cpv1 = PSM_5, cpv2 = PSM_6, cpv3 = PSM_7, cpv4 = PSM_8,
                        com1 = PSM_9, com2 = PSM_10, com3 = PSM_11, com4 = PSM_12,
                        ss1 = PSM_13, ss2 = PSM_14, ss3 = PSM_15, ss4 = PSM_16,
                        aps = (aps1 + aps2 + aps3 + aps4)/4,
                        cpv = (cpv1 + cpv2 + cpv3 + cpv4)/4,
                        com = (com1 + com2 + com3 + com4)/4,
                        ss = (ss1 + ss2 + ss3 + ss4)/4,
                        psm = (aps + cpv + com + ss)/4,
                        psm_norm = fxn_norm(psm),
                        aps_norm = fxn_norm(aps), cpv_norm = fxn_norm(cpv),
                        com_norm = fxn_norm(com), ss_norm = fxn_norm(ss),
                        #DEMOGRAPHIC/ATTITUDINAL VARIABLES
                        job_sect, age, male, olympiad, gpa, family_income,
                        city_size, religion, religion_services,
                        religion_denom = recode(religion_denom, `1` = "Orthodox Christianity", `2` =  "Protestantism",
                                                `3` = "Catholicism", `4` = "Islam", `5` = "Judaism", `6` = "Buddhism",
                                                `7` = "Other"),
                        gpa_norm = fxn_norm(gpa),
                        family_inc_norm = fxn_norm(family_income),
                        city_size_norm = fxn_norm(city_size),
                        ege_taken = EGE_taken,
                        ege_math = as.numeric(EGE_3_TEXT),
                        ege_ru = as.numeric(EGE_10_TEXT),
                        ege_avg = (ege_math + ege_ru)/2,
                        region1 = recode(region, `63` = "Regional Capital", `74` = "Oblast (minus capital)", `79` = "Oblast (minus capital)", 
                                        `80` = "Oblast (minus capital)", `84` = "Oblast (minus capital)", .default = "Other"),
                        f_work_fed = relatives_work_1_1, m_work_fed = relatives_work_1_2, r_work_fed = relatives_work_1_3,
                        f_work_reg = relatives_work_2_1, m_work_reg = relatives_work_2_2, r_work_reg = relatives_work_2_3,
                        f_work_gov = relatives_work_3_1, m_work_gov = relatives_work_3_2, r_work_gov = relatives_work_3_3,
                        f_work_corp = relatives_work_4_1, m_work_corp = relatives_work_4_2, r_work_corp = relatives_work_4_3,
                        f_work_sme = relatives_work_5_1, m_work_sme = relatives_work_5_2, r_work_sme = relatives_work_5_3,
                        f_work_owner = relatives_work_6_1, m_work_owner = relatives_work_6_2, r_work_owner = relatives_work_6_3,
                        f_work_finance = relatives_work_7_1, m_work_finance = relatives_work_7_2, r_work_finance = relatives_work_7_3,
                        f_work_ngo = relatives_work_8_1, m_work_ngo = relatives_work_8_2, r_work_ngo = relatives_work_8_3,
                        f_work_law = relatives_work_9_1, m_work_law = relatives_work_9_2, r_work_law = relatives_work_9_3,
                        f_work_mil = relatives_work_10_1, m_work_mil = relatives_work_10_2, r_work_mil = relatives_work_10_3,
                        f_work_other = relatives_work_11_1, m_work_other = relatives_work_11_2, r_work_other = relatives_work_11_3,
                        f_no_work = relatives_work_12_1, m_no_work = relatives_work_12_2, r_no_work = relatives_work_12_3,
                        f_work_na = relatives_work_13_1, m_work_na = relatives_work_13_2, r_work_na = relatives_work_13_3,
                        f_pubsect = ifelse(f_work_fed == 1 | f_work_reg == 1 | f_work_gov == 1, 1, 0),
                        m_pubsect = ifelse(m_work_fed == 1 | m_work_reg == 1 | m_work_gov == 1, 1, 0),
                        r_pubsect = ifelse(r_work_fed == 1 | r_work_reg == 1 | r_work_gov == 1, 1, 0),
                        rel_pubsect = ifelse(f_pubsect == 1 | m_pubsect == 1 | r_pubsect == 1, 1, 0),
                        parent_pubsect = ifelse(f_work_fed == 1 | f_work_reg == 1 | f_work_gov == 1 | m_work_fed == 1 | m_work_reg == 1 | m_work_gov == 1, 1, 0), 
                        parent_privsect = ifelse(f_work_corp == 1 | f_work_sme == 1 | f_work_owner == 1 | f_work_finance == 1 | m_work_corp == 1 | m_work_sme == 1 | m_work_owner == 1 | m_work_finance == 1, 1, 0), 
                        parent_ngo = ifelse(f_work_ngo == 1 | m_work_ngo == 1, 1, 0),
                        parent_mil = ifelse(f_work_mil == 1 | m_work_mil == 1, 1, 0),
                        parent_legal = ifelse(f_work_law == 1 | m_work_law == 1, 1, 0),
                        #collapsing all MA students together
                        class_yr = recode(class_yr, `6` = 5),
                        #field of study
                        dep = recode_factor(dep, `1` = "IGUP_PubAdmin", `2` = "IGUP_Comm", `3` = "IGUP_Trade", 
                                            `9` = "Other", `10` = "IGUP_EconSec", `11`= "IGUP_PubPol",
                                            `12` = "IGUP_Management", `13` = "VShEM_Econ", `14` = "VShEM_Management",
                                            `15` = "VShEM_Customs", `16` = "VShEM_BusCS", `17` = "VShEM_EconSec",
                                            `18` = "VShEM_SocPol", `19` = "VShEM_AppliedCS", `20` = "VShEM_Finance",
                                            `22` = "UGI_Pol", `23` = "UGI_Soc", `24` = "UGI_HR", `26` = "UGI_IR"),
                        dep2 = case_when(
                          grepl("IGUP", dep) ~ "IGUP",
                          grepl("VShEM", dep) ~ "VShEM",
                          grepl("UGI|Other", dep) ~ "Other"),
                        dep_PA = ifelse(dep2 == "IGUP", 1, 0),
                        #QUALITY CHECK VARIABLES
                        randomid, soc_sci_knowledge = soc_science_knowledg, device,
                        #time to complete
                        duration = round(difftime(end_time, start_time, units=c("auto","secs","mins","hours","days","weeks")), 0),
                        #screeners
                        screener1 = ifelse(is.na(S1_27) == F & is.na(S1_29) == F, 1, 0),
                        screener2 = ifelse(is.na(S2_24) == F, 1, 0),
                        screener = screener1 + screener2
)


region_dta$study <- rep("Region", nrow(region_dta))

RegionClean_DF <- as.data.frame(region_dta)

#write.csv(RegionClean_DF, "RegionClean.csv", na = "", row.names = F)





###UKRAINE RESEARCH SITE
rm(list = ls())

fxn_norm = function(x) {(x - min(x, na.rm = T))/(max(x, na.rm = T) - min(x, na.rm = T))}

ukraine_raw <- read_excel("UkraineDataRaw.xlsx")

ukraine_temp <- ukraine_raw %>%
            select(start_time = V8, end_time = V9, randomid = RandomID,
                   matches("S1|S2", ignore.case = FALSE), lab, session,
                   matches("G1|G2|G3", ignore.case = FALSE),
                   contains("lottery"), contains("PSM"),
                   job_sect,
                   age, male = gender, class_yr, fakultet,
                   gpa = GPA, olympiad, contains("EGE"), region,
                   city_size, contains("religion"), family_income,
                   contains("relatives"), ethnic, lang,
                   soc_science_knowledg
  )


#PRELIMINARY VARIABLE CONSTRUCTION FOR DICE TASK GAME

#changing variables names of individual virtual dice rolls
#g1.1...g1.30 are the first set of 20 dice rolls (each respondent sees only 20 of these 30 dice profiles)
#g2.1...g2.30 are the second set of 20 dice rolls (each respondent sees only 20 of these 30 dice profiles)

names(ukraine_temp) = gsub(pattern = "\\(", replacement = "", x = names(ukraine_temp)) 
names(ukraine_temp) = gsub(pattern = "\\)", replacement = "", x = names(ukraine_temp)) 
names(ukraine_temp) = gsub(pattern = "G2v1_T", replacement = "g1.", x = names(ukraine_temp)) 
names(ukraine_temp) = gsub(pattern = "G2v2_T", replacement = "g2.", x = names(ukraine_temp)) 
ukraine_temp <- ukraine_temp %>% dplyr::rename(g1.25 = G2v1F_T1, g1.26 = G2v1F_T2,  g1.27 = G2v1F_T3,  g1.28 = G2v1F_T4, g1.29 = G2v1F_T5 , g1.30 = G2v1F_T6)
ukraine_temp <- ukraine_temp %>% dplyr::rename(g2.25 = G2v2F_T1, g2.26 = G2v2F_T2,  g2.27 = G2v2F_T3,  g2.28 = G2v2F_T4, g2.29 = G2v2F_T5 , g2.30 = G2v2F_T6)

# correct guesses are already coded as 1, incorrect guesses as NA; changing here NA to 0
#ukraine_temp <- ukraine_temp %>% mutate_at(vars(starts_with('g', ignore.case = F), -contains("gpa")), funs(replace_na(., 0)))
#replaced the above line with the line below b/c "funs" deprecated
ukraine_temp <- ukraine_temp %>% mutate_at(vars(starts_with('g', ignore.case = F), -contains("gpa")), list(~replace_na(., 0)))

# correct_guesses1 is sum of correct guesses in first 20 dice rolls
dice_rolls1 <- ukraine_temp %>% select(starts_with('g1.', ignore.case = F))
ukraine_temp$correct_guesses1 <- rowSums(dice_rolls1)   

# correct_guesses2 is sum of correct guesses in second 20 dice rolls
dice_rolls2 <- ukraine_temp %>% select(starts_with('g2.', ignore.case = F))
ukraine_temp$correct_guesses2 <- rowSums(dice_rolls2)  


#PRELIMINARY VARIABLE CONSTRUCTION FOR BRIBERY GAME

ukraine_temp$bribe_accept_size1 = with(ukraine_temp,
                                       ifelse(G3_bur_br_350 > G3_bur_br_300, 7,
                                              ifelse(G3_bur_br_300 > G3_bur_br_250, 6,
                                                     ifelse(G3_bur_br_250 > G3_bur_br_200, 5,
                                                            ifelse(G3_bur_br_200 > G3_bur_br_150, 4,
                                                                   ifelse(G3_bur_br_150 > G3_bur_br_100, 3,
                                                                          ifelse(G3_bur_br_100 > G3_bur_br_50, 2,
                                                                                 ifelse(G3_bur_br_50 > 0, 1, 0)))))))
)

#this version removes subjects with non-transitive preferences
ukraine_temp$bribe_accept_size2 = with(ukraine_temp,
                                       ifelse((G3_bur_br_50 <= G3_bur_br_100 & 
                                                 G3_bur_br_100 <= G3_bur_br_150 & 
                                                 G3_bur_br_150 <= G3_bur_br_200 & 
                                                 G3_bur_br_200 <= G3_bur_br_250 &
                                                 G3_bur_br_250 <= G3_bur_br_300 & 
                                                 G3_bur_br_300 <= G3_bur_br_350) == TRUE, bribe_accept_size1, NA)
)

#OTHER PRELIMINARY CLEANING

ukraine_temp <- ukraine_temp %>% mutate_at(vars(contains("_work_")), list(~replace_na(., 0)))



#CREATING DATA SET

ukraine_dta <- ukraine_temp %>% transmute(
                        #DICTATOR GAME
                        donate = G1_donate_1, 
                        #converts to hryvnia amount
                        donate5 = donate*5,
                        donate_norm = fxn_norm(donate),
                        #DICE TASK GAME
                        correct_guesses1, correct_guesses2,
                        correct_guesses = correct_guesses1 + correct_guesses2,
                        cheat_rate = (1/(1 - (1/6))) * 1/40 * correct_guesses - (1/6)/(1 - (1/6)),
                        #BRIBERY GAME
                        bribe_accept_size1,
                        bribe_accept_size2,
                        bribe_offer_size = G3_citizen_1,
                        bribe_offer = ifelse(bribe_offer_size > 0, 1, 0),
                        bribe_accept = ifelse(bribe_accept_size1 > 0, 1, 0),
                        bribe = ifelse(is.na(bribe_offer) == FALSE, bribe_offer, bribe_accept),
                        citizen_nobribe = G3_citizen_nobribe,
                        bur_nobribe = G3_bur_nobribe,
                        #RISK AVERSION MEASURE
                        risk_averse1 = 8 - (lottery_1 + lottery_2 + lottery_3 + lottery_4 + lottery_5 + lottery_6 + lottery_7),
                        risk_averse_norm = fxn_norm(risk_averse1),
                        #this version removes subjects with non-transitive preferences
                        risk_averse2 = ifelse((lottery_1 <= lottery_2 & 
                                                 lottery_2 <= lottery_3 & 
                                                 lottery_3 <= lottery_4 &
                                                 lottery_4 <= lottery_5 &
                                                 lottery_5 <= lottery_6 & 
                                                 lottery_6 <= lottery_7) == TRUE, risk_averse1, NA),
                        #PSM
                        aps1 = PSM_1, aps2 = PSM_2, aps3 = PSM_3, aps4 = PSM_4,
                        cpv1 = PSM_5, cpv2 = PSM_6, cpv3 = PSM_7, cpv4 = PSM_8,
                        com1 = PSM_9, com2 = PSM_10, com3 = PSM_11, com4 = PSM_12,
                        ss1 = PSM_13, ss2 = PSM_14, ss3 = PSM_15, ss4 = PSM_16,
                        aps = (aps1 + aps2 + aps3 + aps4)/4,
                        cpv = (cpv1 + cpv2 + cpv3 + cpv4)/4,
                        com = (com1 + com2 + com3 + com4)/4,
                        ss = (ss1 + ss2 + ss3 + ss4)/4,
                        psm = (aps + cpv + com + ss)/4,
                        psm_norm = fxn_norm(psm),
                        aps_norm = fxn_norm(aps), cpv_norm = fxn_norm(cpv),
                        com_norm = fxn_norm(com), ss_norm = fxn_norm(ss),
                        # DEMOGRAPHIC/ATTITUDINAL VARIABLES
                        job_sect, age, male, olympiad, 
                        region1 = recode(region, `1` = "Kyiv City", `3` = "Crimea",
                                        `4` = "Vinnytsia", `5` = "Volyn", `6` = "Dnipro", `7` = "Donetsk",
                                        `8` = "Zhytomyr", `9` = "Zakarpattia", `10` = "Zaporizhia",
                                        `11` = "Ivano-Frankivsk", `12` = "Kyiv", `13` = "Kirovohrad",
                                        `14` = "Luhansk", `15` = "Lviv" ,`16` = "Mykolaiv",
                                        `17` = "Odessa", `18` = "Poltava", `19` = "Rivne", 
                                        `20` = "Sumy", `21` = "Ternopil", `22` = "Kharkiv", 
                                        `23` = "Kherson", `24` = "Khmelnytskyi", 
                                        `25` = "Cherkasy", `26` = "Chernihiv", `27` = "Chernivtsi",
                                        `28` = "Abroad"),
                        region2 = na_if(region1, "Abroad"),
                        region2 = recode(region2, `Cherkasy` = "Central", `Chernihiv` = "Central",
                                         `Kyiv` = "Central", `Kyiv City` = "Central", `Kirovohrad` = "Central",
                                         `Poltava` = "Central", `Sumy` = "Central", `Vinnytsia` = "Central",
                                         `Zhytomyr` = "Central", `Donetsk` = "East", `Kharkiv` = "East",
                                         `Luhansk` = "East", `Dnipro` = "East", `Odessa` = "South",
                                         `Mykolaiv` = "South", `Kherson` = "South", `Zaporizhia` = "South",
                                         `Crimea` = "South", `Chernivtsi` = "West",
                                         `Ivano-Frankivsk` = "West", `Khmelnytskyi` = "West",
                                         `Lviv` = "West", `Rivne` = "West", "Ternopil" = "West",
                                         `Volyn` = "West", `Zakarpattia` = "West"),
                        family_income = na_if(family_income, 99),
                        city_size, religion, religion_services,
                        religion_denom = recode(religion_denom, `1` = "Orthodox", `2` =  "Ukrainian Orthodox - Kyiv",
                                                `3` = "Ukrainian Orthodox - Moscow", `4` = "Ukrainian Autocephalous Orthodo", `5` = "Uniate", 
                                                `6` = "Islam", `7` = "Protestantism", `8` = "Catholicism", `9` =  "Judaism", `10` = "Other"),
                        family_inc_norm = fxn_norm(family_income),
                        city_size_norm = fxn_norm(city_size),
                        #correcting for error in Qualtrics variable coding
                        gpa = dplyr::recode(gpa, `10` = 1, `1` = 2, `2` = 3, `3` = 4, `4` = 5, `5` = 6),
                        gpa_norm = fxn_norm(gpa),
                        zno_taken = EGE_taken,
                        zno_math = na_if(EGE_3_TEXT, "-"),
                        zno_math = na_if(zno_math, 0),
                        zno_math = as.numeric(gsub(",", ".", zno_math)),
                        zno_ukr = as.numeric(gsub(",", ".", EGE_10_TEXT)),
                        zno_law = na_if(EGE_18_TEXT, "-"),
                        zno_law = na_if(zno_law, 0),
                        zno_law = as.numeric(gsub(",", ".", zno_law)),
                        lang = recode(lang, `1` = "Russian", `2` = "Ukrainian", `3` = "Both", `4` = "Other"),
                        ethnic = recode(ethnic, `1` = "Ukrainian", `2` = "Russian", `3` = "Tatar", `4` = "Bulgar",
                                        `5` = "Polish", `6` = "Moldovan", `7` = "Belarussian", `8` = "Jewish", `9` = "Other"),
                        f_work_pros = relatives_work_50_1, m_work_pros = relatives_work_50_2, r_work_pros = relatives_work_50_3,
                        f_work_court = relatives_work_51_1, m_work_court = relatives_work_51_2, r_work_court = relatives_work_51_3,
                        f_work_law = relatives_work_52_1, m_work_law = relatives_work_52_2, r_work_law = relatives_work_52_3,
                        f_work_fed = relatives_work_1_1, m_work_fed = relatives_work_1_2, r_work_fed = relatives_work_1_3,
                        f_work_reg = relatives_work_2_1, m_work_reg = relatives_work_2_2, r_work_reg = relatives_work_2_3,
                        f_work_gov = relatives_work_3_1, m_work_gov = relatives_work_3_2, r_work_gov = relatives_work_3_3,
                        f_work_corp = relatives_work_4_1, m_work_corp = relatives_work_4_2, r_work_corp = relatives_work_4_3,
                        f_work_sme = relatives_work_5_1, m_work_sme = relatives_work_5_2, r_work_sme = relatives_work_5_3,
                        f_work_owner = relatives_work_6_1, m_work_owner = relatives_work_6_2, r_work_owner = relatives_work_6_3,
                        f_work_finance = relatives_work_7_1, m_work_finance = relatives_work_7_2, r_work_finance = relatives_work_7_3,
                        f_work_ngo = relatives_work_8_1, m_work_ngo = relatives_work_8_2, r_work_ngo = relatives_work_8_3,
                        f_work_law2 = relatives_work_9_1, m_work_law2 = relatives_work_9_2, r_work_law2 = relatives_work_9_3,
                        f_work_mil = relatives_work_10_1, m_work_mil = relatives_work_10_2, r_work_mil = relatives_work_10_3,
                        f_work_other = relatives_work_11_1, m_work_other = relatives_work_11_2, r_work_other = relatives_work_11_3,
                        f_no_work = relatives_work_12_1, m_no_work = relatives_work_12_2, r_no_work = relatives_work_12_3,
                        f_work_na = relatives_work_13_1, m_work_na = relatives_work_13_2, r_work_na = relatives_work_13_3,
                        f_pubsect = ifelse(f_work_fed == 1 | f_work_reg == 1 | f_work_gov == 1, 1, 0),
                        m_pubsect = ifelse(m_work_fed == 1 | m_work_reg == 1 | m_work_gov == 1, 1, 0),
                        r_pubsect = ifelse(r_work_fed == 1 | r_work_reg == 1 | r_work_gov == 1, 1, 0),
                        rel_pubsect = ifelse(f_pubsect == 1 | m_pubsect == 1 | r_pubsect == 1, 1, 0),
                        parent_pubsect = ifelse(f_work_fed == 1 | f_work_reg == 1 | f_work_gov == 1 | m_work_fed == 1 | m_work_reg == 1 | m_work_gov == 1, 1, 0), 
                        parent_privsect = ifelse(f_work_corp == 1 | f_work_sme == 1 | f_work_owner == 1 | f_work_finance == 1 | m_work_corp == 1 | m_work_sme == 1 | m_work_owner == 1 | m_work_finance == 1, 1, 0), 
                        parent_ngo = ifelse(f_work_ngo == 1 | m_work_ngo == 1, 1, 0),
                        parent_mil = ifelse(f_work_mil == 1 | m_work_mil == 1, 1, 0),
                        parent_legal = ifelse(f_work_law == 1 | f_work_law2 == 1 | f_work_pros == 1 | f_work_court == 1 | m_work_law == 1 | m_work_law2 == 1 | m_work_pros == 1 | m_work_court == 1, 1, 0),
                        #collapsing all MA students together
                        class_yr = recode(class_yr, `6` = 5),
                        #dep
                        dep = fakultet,  
                        dep_publaw = ifelse(dep == 1 | dep == 2 | dep == 4 | dep == 5 , 1, 0),
                        dep_tri = recode(dep, `1` = "Pub. Law", `2` = "Pub. Law", `3` = "Priv. Law",
                                         `4` = "Pub. Law", `5` = "Pub. Law", `6` = "Priv. Law",
                                         `7` = "Soc. Science & Other", `8` = "Soc. Science & Other", 
                                         `9` = "Soc. Science & Other", `10` = "Soc. Science & Other",
                                         `14`= "Soc. Science & Other"),
                        dep = recode(dep, `1` = "Inst. Criminal Justice", `2` = "Investigator Training Dep.",
                                     `3` = "Commercial Law Dep. ", `4` = "Judicial-Admin Dep.",
                                     `5` = "Social Law Dep.", `6` = "Advokaty Dep.", `7` = "Soc. Science Dep.",
                                     `8` = "Journalism Dep.", `9` = "Other", `10` = "Other", `14` = "Other"),
                        #QUALITY CHECK VARIABLES
                        randomid, lab, session,
                        soc_sci_knowledge = soc_science_knowledg,
                        #time to complete
                        duration = round(difftime(end_time, start_time, units=c("auto","secs","mins","hours","days","weeks")), 0),
                        #screeners
                        screener1 = ifelse(is.na(S1_27) == F & is.na(S1_29) == F, 1, 0),
                        screener2 = ifelse(is.na(S2_8) == F & is.na(S2_12) == F, 1, 0),
                        screener = screener1 + screener2
)

ukraine_dta$study <- rep("Ukraine", nrow(ukraine_dta))


#converting tibble to dataframe
UkraineClean_DF <- as.data.frame(ukraine_dta)

#write.csv(UkraineClean_DF, "UkraineClean.csv", na = "", row.names = F)


########MERGING ALL RESEARCH SITES

rm(list = ls())

moscow_clean = read_csv("MoscowClean.csv")
region_clean = read_csv("RegionClean.csv")
ukraine_clean = read_csv("UkraineClean.csv")

AllSites_dta_temp = full_join(moscow_clean, region_clean)
AllSites_dta = full_join(AllSites_dta_temp, ukraine_clean)

#converting tibble to dataframe
AllSites_DF <- as.data.frame(AllSites_dta)

#write.csv(AllSites_DF, "AllSitesClean.csv", na = "", row.names = F)

